1 package org.apache.lucene.search.spell;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import org.apache.lucene.util.IntsRef;
21
22 /**
23 * Damerau-Levenshtein (optimal string alignment) implemented in a consistent
24 * way as Lucene's FuzzyTermsEnum with the transpositions option enabled.
25 *
26 * Notes:
27 * <ul>
28 * <li> This metric treats full unicode codepoints as characters
29 * <li> This metric scales raw edit distances into a floating point score
30 * based upon the shortest of the two terms
31 * <li> Transpositions of two adjacent codepoints are treated as primitive
32 * edits.
33 * <li> Edits are applied in parallel: for example, "ab" and "bca" have
34 * distance 3.
35 * </ul>
36 *
37 * NOTE: this class is not particularly efficient. It is only intended
38 * for merging results from multiple DirectSpellCheckers.
39 */
40 public final class LuceneLevenshteinDistance implements StringDistance {
41
42 /**
43 * Creates a new comparator, mimicing the behavior of Lucene's internal
44 * edit distance.
45 */
46 public LuceneLevenshteinDistance() {}
47
48 @Override
49 public float getDistance(String target, String other) {
50 IntsRef targetPoints;
51 IntsRef otherPoints;
52 int n;
53 int d[][]; // cost array
54
55 // NOTE: if we cared, we could 3*m space instead of m*n space, similar to
56 // what LevenshteinDistance does, except cycling thru a ring of three
57 // horizontal cost arrays... but this comparator is never actually used by
58 // DirectSpellChecker, it's only used for merging results from multiple shards
59 // in "distributed spellcheck", and it's inefficient in other ways too...
60
61 // cheaper to do this up front once
62 targetPoints = toIntsRef(target);
63 otherPoints = toIntsRef(other);
64 n = targetPoints.length;
65 final int m = otherPoints.length;
66 d = new int[n+1][m+1];
67
68 if (n == 0 || m == 0) {
69 if (n == m) {
70 return 0;
71 }
72 else {
73 return Math.max(n, m);
74 }
75 }
76
77 // indexes into strings s and t
78 int i; // iterates through s
79 int j; // iterates through t
80
81 int t_j; // jth character of t
82
83 int cost; // cost
84
85 for (i = 0; i<=n; i++) {
86 d[i][0] = i;
87 }
88
89 for (j = 0; j<=m; j++) {
90 d[0][j] = j;
91 }
92
93 for (j = 1; j<=m; j++) {
94 t_j = otherPoints.ints[j-1];
95
96 for (i=1; i<=n; i++) {
97 cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
98 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
99 d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
100 // transposition
101 if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
102 d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
103 }
104 }
105 }
106
107 return 1.0f - ((float) d[n][m] / Math.min(m, n));
108 }
109
110 private static IntsRef toIntsRef(String s) {
111 IntsRef ref = new IntsRef(s.length()); // worst case
112 int utf16Len = s.length();
113 for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
114 cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
115 }
116 return ref;
117 }
118 }